/*******************************************************************************
*Project :          COVID-19 Research Project 

*Purpose : 		    Clean Womply Dataset for Analysis

*Source : 			Womply Data from AWS 

*******************************************************************************/

clear all 
set more off 
capture log close 

global user g1stm01 // change user accordingly 
cd "C:/Users/${user}/Dropbox (Research)/stay_at_home_project"

* generate global date
global date=subinstr(c(current_date), " ", "_", .)
set obs 1 
gen date="${date}"
split date, parse("_")
cap replace date=date3+"_"+date2+"_"+date4 if date1==""
cap replace date=date2+"_"+date1+"_"+date3 if date1!=""
global date=date[1]
display as error "Today's date = ${date}"
 

* get directory of files - change for every data update 
global datedir "29April2020"


*===============================================================================
* Append dta files 
*===============================================================================

clear
gen source = ""

set obs  1 

local files : dir "data/raw/womply/dataSTATA/${datedir}" files "*.dta" 

* append files 
foreach file in `files' {
   append using "data/raw/womply/dataSTATA/${datedir}/`file'"
   replace source = "`file'" if mi(source)
}

drop if _n ==1 

* clean missing values 	
foreach var of varlist _all {   
	 capture confirm string variable `var' 
	 if !_rc { 
		 replace `var' = "" if `var' == "\\N" 
	     replace `var' = lower(trim(`var')) 
		  destring `var' , replace 
		} 
}

* save file
save "data/raw/womply/dataSTATA/womply_data_combined.dta" , replace 


*===============================================================================
* Clean data 
*===============================================================================
	  
* load data 
use "data/raw/womply/dataSTATA/womply_data_combined.dta" , clear 
describe 
mdesc
rename *  , lower 

* name variables - using readme_womply_data.docx
  rename v1 trans_date
  rename v2 state_name 
  rename v3 county_name
  rename v4 category 
  rename v5 date_comparison
  rename v6 revenue
  rename v7 merchants
  rename v8 transactions
  rename v9 num_unique_cc
  rename v10 pre_covid_merchant_size 
  rename v11 intra_covid_active_merchants
  rename v12 intra_covid_inactive_merchants
  rename v13 average_revenue 
  rename v14 revenue_yago 
  rename v15 merchants_yago 
  rename v16 transactions_yago 
  rename v17 unique_cc_yago 
  rename v18 average_revenue_yago 
  rename v19 yoy_variance 

  replace state_name = lower(state_name) 
  replace county_name = lower(county_name) 
  replace county_name = "" if county_name == "unknown" 

* merge in county fips code 
  preserve 
	  use "data/raw/womply/dataSTATA/countyname_countyfips_xwalk.dta"  , clear
	  replace state_name = lower(state_name) 
	  replace county_name = lower(county_name) 
	  tempfile countyfips
	  save `countyfips' , replace 
  restore
  
  merge m:1 state_name county_name using `countyfips'
  tab _merge 
  keep if _merge==3
  drop _merge 
  
* generate county fips - corrected format 
  replace county_fips = state_fips + county_fips 
  destring county_fips , replace 
  format county_fips %05.0f
  
* generate date variable 
  g date = date(trans_date, "YMD") 
  format date %td 
  
* keep only relevant variables 
  keep date revenue county_name state_name county_fips category revenue pre_covid_merchant_size ///
  intra_covid_active_merchants intra_covid_inactive_merchants merchants 
  order date category *_name county_fips revenue
  sort date category county_fips
  
* gen %firms with 0 revenue 
  g share_firms_no_rev = (intra_covid_inactive_merchants)/(intra_covid_active_merchants + intra_covid_inactive_merchants) 

 * save dataset 
  compress 
  save "data/clean/womply/womply_daily_revenue.dta" , replace 

 
